import numpy as np
import pandas as pd
import lightgbm as lgb
import os
from jupyterthemes import jtplot
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the jupyterthemes 'solarizedd' plotting style.
jtplot.style(theme='solarizedd')
# Default figure size (width, height) for every plot in the notebook.
plt.rcParams['figure.figsize'] = (20.0, 10.0)
# Load the autoreload extension and set it to mode 2, so that edits to the
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Project layout, expressed relative to the directory the notebook runs in.
ROOT_DIR = '..'
DATA_DIR = os.path.join(ROOT_DIR, 'data')

# One sub-directory of DATA_DIR per processing stage.
DATA_RAW, DATA_INTERIM, DATA_EXTERNAL = (
    os.path.join(DATA_DIR, stage)
    for stage in ('raw', 'interim', 'external')
)

# Location of the project's own Python sources.
SRC_DIR = os.path.join(ROOT_DIR, 'src')

# Raw Seattle files.
SEATTLE_CALENDAR, SEATTLE_LISTINGS, SEATTLE_REVIEWS = (
    os.path.join(DATA_RAW, 'seattle', file_name)
    for file_name in ('calendar.csv', 'listings.csv', 'reviews.csv')
)

# Pickled intermediate file for the Seattle listings columns.
SEATTLE_LISTINGS_COLS = os.path.join(
    DATA_INTERIM, 'seattle', 'listings_cols_df.pkl')
import sys
# Add the project's source directories to the module search path.
sys.path.append(SRC_DIR)
sys.path.append(os.path.join(SRC_DIR, 'data'))
# Project-local modules (presumably resolved through the paths added above).
import preprocessing as pp
import missing_data as md
import make_dataset
# Uncomment the line below to create the processed dataset; keep it
# commented if the dataset is already processed.
# make_dataset.create_dataset()
# Load the three processed tables used in the rest of the notebook.
calendar, listings, reviews = make_dataset.load_processed()
# Size of the calendar table, followed by a peek at its first rows.
print(calendar.shape)
calendar.head()
# First and last date covered by the calendar.
print('The data goes from {} to {}'.format(
    calendar['date'].min(),
    calendar['date'].max(),
))
Is every listing_id present on every date?
# Non-null row counts per date for every column, summarised across dates.
calendar.groupby('date').count().describe()
# Number of rows per listing_id, summarised across listings.
calendar.listing_id.value_counts().describe()
# A full date x listing grid has exactly
# (distinct dates) * (distinct listings) rows. Both factors are read from the
# data instead of being hard-coded, so the check stays valid if the dataset
# changes.
calendar.shape[0] == calendar.date.nunique() * calendar.listing_id.nunique()
It looks like "calendar" has exactly one entry per listing_id per date. A better way to represent that kind of data may be to pivot it into time series: one row per date and one column per listing_id.
# Price time series: one row per date, one column per listing_id.
price_ts = calendar.pivot_table(index='date',
                                columns='listing_id',
                                values='price')
print(price_ts.shape)
price_ts.head()
# Availability time series with the same layout. 'first' keeps the stored
# value as it is (no averaging). The previous identity lambda was not a real
# aggregation and only worked while every (date, listing_id) pair had exactly
# one row.
avail_ts = calendar.pivot_table(index='date',
                                columns='listing_id',
                                values='available',
                                aggfunc='first')
print(avail_ts.shape)
avail_ts.head()
What percentage of the properties is available on each day of the year?
# Share of listings available on each date, expressed as a percentage.
avail_ts.mean(axis=1).mul(100).plot()
How does the mean price evolve in the year?
# Average price across all listings for each date.
price_ts.mean(axis='columns').plot()
It looks like it has a weekly seasonality, and a yearly seasonality, plus an upwards trend.
Are the price movements correlated to each other?
# Pairwise correlation between the price series of the listings
# (one row and one column per listing_id).
price_corr = price_ts.corr()
# Show the correlation matrix as a heatmap.
sns.heatmap(price_corr)
Let's find some clusters...
import scipy
import scipy.cluster.hierarchy as sch
def cluster_corr(ts):
    """
    Clusters the time series according to their correlation.

    The correlation matrix of the columns of ``ts`` is computed and each of
    its rows is used as the feature vector of the corresponding column. The
    columns are then grouped by complete-linkage hierarchical clustering on
    the pairwise distances between those vectors, and the tree is cut at half
    of the largest pairwise distance.

    Part of the code was taken from here:
    https://github.com/TheLoneNut/CorrelationMatrixClustering/blob/master/
    CorrelationMatrixClustering.ipynb

    Args:
        ts (pandas.DataFrame): one time series per column.

    Returns:
        tuple: ``(clmts, ind)``. ``clmts`` is a new DataFrame with the
        columns of ``ts`` reordered so that members of the same cluster are
        adjacent. ``ind`` is a numpy array holding the cluster label of each
        column of ``ts``, in the original column order.
    """
    # Imported here so the function does not rely on the undocumented
    # ``scipy.cluster.hierarchy.distance`` alias.
    from scipy.spatial.distance import pdist

    columns = ts.columns.tolist()
    if len(columns) < 2:
        # There are no pairs to compare: everything belongs to one cluster.
        return ts.reindex(columns=columns), np.ones(len(columns), dtype=int)

    # Undefined correlations are replaced by 0 so that the distance
    # computation only sees finite values.
    X = ts.corr().fillna(0).values
    d = pdist(X)
    L = sch.linkage(d, method='complete')
    ind = sch.fcluster(L, 0.5 * d.max(), 'distance')

    # A stable sort keeps the original column order inside each cluster.
    order = np.argsort(ind, kind='stable')
    # ``reindex_axis`` was removed from pandas; ``reindex(columns=...)`` is
    # the supported equivalent.
    clmts = ts.reindex(columns=[columns[i] for i in order])
    return clmts, ind
# Reorder the price series by correlation cluster; `ind` receives the second
# value returned by cluster_corr (the cluster assignment).
clmts, ind = cluster_corr(price_ts)
# Correlation matrix of the reordered series, shown as a heatmap.
clustered_corr = clmts.corr()
sns.heatmap(clustered_corr)
# Total number of missing (NaN) entries in the correlation matrix.
clustered_corr.isnull().sum().sum()
Let's filter the missing correlations
# Drop the rows, and then the columns, that contain no correlation at all.
# `thresh` in DataFrame.dropna is an absolute count of non-NaN values, not a
# fraction, so the previous `thresh=0.99` meant "keep anything with at least
# one value". `how='all'` states that same behaviour explicitly.
filtered_corr = clustered_corr.dropna(how='all').dropna(how='all', axis=1)
# Number of NaN entries left after filtering.
filtered_corr.isnull().sum().sum()
filtered_corr.shape
sns.heatmap(filtered_corr)
How do those clusters behave? Let's show the mean price in time for each cluster.
# Distinct cluster labels produced by the clustering step.
cluster_ids = np.unique(ind)
# For every cluster, average the price series of its member listings on each
# date; `ind` is matched positionally against the columns of price_ts.
mean_dict = {
    'cluster_' + str(label): price_ts.iloc[
        :, np.flatnonzero(ind == label)].mean(axis=1)
    for label in cluster_ids
}
cluster_means = pd.DataFrame(mean_dict)
cluster_means.plot()
plt.title('Means of the clusters in time')
And how many listings does each cluster have?
# Cluster label of every listing, indexed by the columns of price_ts.
clusters = pd.DataFrame({'cluster': ind}, index=price_ts.columns)
# Pie chart of the number of listings per cluster, smallest cluster first.
(clusters['cluster']
 .value_counts()
 .sort_values()
 .plot.pie(label='cluster_id'))
# Size of the reviews table, followed by a peek at its first rows.
print(reviews.shape)
reviews.head()
# Number of distinct listings that appear in the reviews.
reviews.listing_id.nunique()
# Number of distinct review ids.
reviews.id.nunique()
So, the "id" column identifies a review uniquely
# Number of distinct reviewers.
print(reviews.reviewer_id.nunique())
# The five reviewers with the most reviews.
reviews.reviewer_id.value_counts().head(5)
The reviewer name doesn't seem like an important feature.
Let's see when people wrote their reviews.
# Number of reviews written on each date. `size()` gives one count per date.
# The previous `count()` produced one series per column, each excluding that
# column's missing values, so the plot drew several overlapping lines.
reviews.groupby('date').size().plot()
plt.title('Amount of reviews in time')
The number of reviews seems to be seasonal (annually), and also has an upward trend. Looks like a good fit for a multiplicative model.
The comments could be used for sentiment analysis.
# Size of the listings table, followed by a peek at its first rows.
print(listings.shape)
listings.head()
# Number of distinct values in the neighbourhood column.
len(listings.neighbourhood.unique())
There is data for 96 neighborhoods
# Number of neighbourhoods that have more than 20 listings.
listings.neighbourhood.value_counts().gt(20).sum()
But only 38 have more than 20 samples (I wouldn't get statistical features from the others)
# Histogram of the overall review score.
listings.review_scores_rating.hist(bins=30)
# Histograms of the accuracy and location review scores.
listings.review_scores_accuracy.hist(bins=30)
listings.review_scores_location.hist(bins=30)
# Mean of the "*_missing" columns. Presumably these are 0/1 flags marking
# listings whose score was missing, which would make the mean the missing
# fraction -- TODO confirm against the preprocessing code.
listings.review_scores_location_missing.mean()
listings.review_scores_rating_missing.mean()
# Fraction of listings without a description. A missing (NaN) description
# never compares equal to '', so it is counted explicitly alongside empty
# strings.
(listings.description.isnull() | (listings.description == '')).mean()
OK, every listing has a description. The descriptions could be used to cluster the listings.